cd "C:\Users\jedwab\Desktop\Replication files for Esch et al 2024\Data Analysis in Stata"

*******************************************************************************************************************

* This do-file creates the main data sets for the analysis:
* country3Dfull: main file at the country level
* totpopsubregion: extra data set with total population at the UN subregion level today
* city3Dfull: main file at the city level

**************************
**************************
*** COUNTRY-LEVEL DATA ***
**************************
**************************

*****************************
*** COUNTRY-LEVEL 3D DATA ***
*****************************

* This is the data aggregated at the country level.  
* Date of creation: 10-17-2022
clear
import excel "Country_Stats_17102022.xlsx", sheet ("Merged") firstrow
count
* 235 countries. 
* We change some country names.
gen country_wb = NAME
replace country_wb = "Bahamas, The"	if country_wb == "Bahamas"
replace country_wb = "Brunei Darussalam"	if country_wb == "Brunei"
replace country_wb = "Cabo Verde"	if country_wb == "Cape Verde"
replace country_wb = "Channel Islands"	if country_wb == "Jersey"
replace country_wb = "Congo, Dem. Rep."	if country_wb == "Democratic Republic of the Congo"
replace country_wb = "Congo, Rep."	if country_wb == "Republic of Congo"
replace country_wb = "Cote d'Ivoire"	if country_wb == "CÃ´te d'Ivoire"
replace country_wb = "Curacao"	if country_wb == "CuraÃ§ao"
replace country_wb = "Egypt, Arab Rep."	if country_wb == "Egypt"
replace country_wb = "Eswatini"	if country_wb == "Swaziland"
replace country_wb = "Gambia, The"	if country_wb == "Gambia"
replace country_wb = "Hong Kong SAR, China"	if country_wb == "Hong Kong"
replace country_wb = "Iran, Islamic Rep."	if country_wb == "Iran"
replace country_wb = "Korea, Dem. People's Rep."	if country_wb == "North Korea"
replace country_wb = "Korea, Rep."	if country_wb == "South Korea"
replace country_wb = "Kyrgyz Republic"	if country_wb == "Kyrgyzstan"
replace country_wb = "Lao PDR"	if country_wb == "Laos"
replace country_wb = "Macao SAR, China"	if country_wb == "Macao"
replace country_wb = "North Macedonia"	if country_wb == "Macedonia"
replace country_wb = "Russian Federation"	if country_wb == "Russia"
replace country_wb = "Sao Tome and Principe"	if country_wb == "SÃ£o TomÃ© and PrÃ­ncipe"
replace country_wb = "Sint Maarten (Dutch part)"	if country_wb == "Sint Maarten"
replace country_wb = "Slovak Republic"	if country_wb == "Slovakia"
replace country_wb = "St. Kitts and Nevis"	if country_wb == "Saint Kitts and Nevis"
replace country_wb = "St. Lucia"	if country_wb == "Saint Lucia"
replace country_wb = "St. Martin (French part)"	if country_wb == "Saint-Martin"
replace country_wb = "St. Vincent and the Grenadines"	if country_wb == "Saint Vincent and the Grenadines"
replace country_wb = "Syrian Arab Republic"	if country_wb == "Syria"
replace country_wb = "Turkiye"	if country_wb == "Turkey"
replace country_wb = "Venezuela, RB"	if country_wb == "Venezuela"
replace country_wb = "Virgin Islands (U.S.)"	if country_wb == "Virgin Islands, U.S."
replace country_wb = "West Bank and Gaza"	if country_wb == "Palestina"
replace country_wb = "Yemen, Rep."	if country_wb == "Yemen"
* We rename some variables and label them. 
ren Total_2D stock2D
* V1 = old version of WSF3D
* V2 = new version of WSF3D 
ren Av_H_v1 avght_old
ren Av_H_v2 avght_new
ren Total_V_v1 stock3D_old
ren Total_V_v2 stock3D_new
* The GDP values = 0 are actually missing so we replace the 0s by missing values. 
replace GDP_2015 = . if GDP_2015 == 0
* We label the variables. 
label var GDP_2015 "Total GDP 2015 based on night lights"
label var stock2D "Built-up area 2012-2019 (sq m)"
label var stock3D_old "Volume 2012-2019 (cubic m; excl. tall buildings)"
label var stock3D_new "Volume 2012-2019 (cubic m; incl. tall buildings)"
label var avght_old "Average height 2012-2019 (m; excl. tall buildings)"
label var avght_new "Average height 2012-2019 (m; incl. tall buildings)"
* We create logs of the variables. 
foreach X of varlist stock* avght* {
gen l`X' = log(`X')
label var l`X' "Log of `X'"
}
sort country_wb
save country3D, replace

* We then add more variables. 

**********************
*** LAND AREA DATA ***
**********************

* Variable: Land area (sq. km)
* Source: World Bank's World Development Indicators database
* Last accessed: 10-05-2022
* Downloaded file: Data_Extract_From_World_Development_Indicators_land
* We modify the excel file manually so that it can be imported into Stata.
* Modified file: Data_Extract_From_World_Development_Indicators_land2
* We then import it into stata. 

clear
import excel "Data_Extract_From_World_Development_Indicators_land2.xlsx", sheet("Data") firstrow clear
* We only keep countries, not regions. 
keep if _n <= 217
* We use the year 2019 for the analysis. 
gen landareasqkm = y2019
keep countrycode countryname landarea
ren countrycode ccode
ren countryname country_wb 
tab country_wb if landarea == .
* Some countries have missing values.
* We use the Wikipedia webpage of these countries to obtain their values.
* From Wikipedia (last accessed 10-05-2022):
* Link for the Channel Islands:
* https://en.wikipedia.org/wiki/Channel_Islands
replace landarea = 198 if country_wb == "Channel Islands"
* https://en.wikipedia.org/wiki/Kosovo
replace landarea = 10887 if country_wb == "Kosovo"
sort country_wb 
save landarea, replace
count
* 217 countries

******************************************************
*** MAIN DATA SET WITH SOME WORLD BANK INFORMATION ***
******************************************************

* We merge with the main data set *
* We start with the country3D data set * 
use country3D, clear
* We add the land area from the World Bank data set.
sort country_wb 
merge country_wb using landarea
tab _m
tab country_wb  if _m == 1
tab country_wb  if _m == 2
* The unmatched ones (_m == 1) are small countries except Taiwan. 
* We keep all countries but will provide a pcGDP estimate for Taiwan. 
replace country_wb = NAME if _m == 1
drop _m
* We create a main "sample" dummy. 
* This sample includes countries in the World Bank list (or Taiwan) and with volume data: 
gen sample = (stock3D_new != . & (ccode != "" | country_wb == "Taiwan"))
tab sample
replace ccode = "TWN" if country_wb == "Taiwan"
* We also label the new country variables
label var country_wb "Country name in WB-WDI"
label var ccode "Country code in WB-WDI"
label var sample "Volume data and (present in WB-WDI or Taiwan)"
tab sample
* 214 countries belong to the main sample
* We now add the land area of Taiwan, using Wikipedia (last accessed 10-05-2022):
replace landarea = 35980 if country_wb == "Taiwan"
order ccode country_wb sample Country_ID NAME 
label var landareasqkm "Land area (sq km; WB-WDI)"
codebook landarea if sample == 1
sort country_wb
save country3Dfull, replace

*****************************
*** TOTAL POPULATION DATA ***
*****************************

* Population, total
* Source: World Bank's World Development Indicators database
* Last accessed: 10-05-2022
* Downloaded file: Data_Extract_From_World_Development_Indicators_pop
* We modify the excel file manually so that it can be imported into Stata.
* Modified file: Data_Extract_From_World_Development_Indicators_pop2
* We then import it into stata. 

clear
import excel "Data_Extract_From_World_Development_Indicators_pop2.xlsx", sheet("Data") firstrow clear
* We only keep countries, not regions. 
keep if _n <= 217
* We use the year 2019 for the analysis. 
gen totpop = y2019
gen totpop2015 = y2015
keep countrycode countryname totpop totpop2015
ren countrycode ccode
ren countryname country_wb 
tab country_wb if totpop == .
* Population is expressed in inhabitants
* For countries with missing values, we use the ones from Wikipedia. 
* From Wikipedia (last accessed 10-05-2022; circa 2019):
replace totpop = 6209262 if country_wb == "Eritrea"
replace totpop2015 = 6209262 if country_wb == "Eritrea"
sort country_wb 
save totpop, replace
count

* We merge with the main data set *
use country3Dfull, clear
sort country_wb 
merge country_wb using totpop
tab _m
tab country_wb if _m == 1
drop _m
* We add the population for Taiwan. 
* From Wikipedia (last accessed 10-05-2022; circa 2019):
replace totpop = 23777737 if country_wb == "Taiwan"
label var totpop "Total population c. 2019 (inh.; WB-WDI)"
label var totpop2015 "Total population c. 2015 (inh.; WB-WDI)"
* We create the log of total population.
gen ltotpop = log(totpop)
label var ltotpop "Log total population c. 2019 (inh.; WB-WDI)"
codebook totpop if sample == 1
save country3Dfull, replace

*************************
*** URBANIZATION RATE ***
*************************

* Urban population (% of total population)
* Source: World Bank's World Development Indicators database
* Last accessed: 10-05-2022
* Downloaded file: Data_Extract_From_World_Development_Indicators_urbshare
* We modify the excel file manually so that it can be imported into Stata.
* Modified file: Data_Extract_From_World_Development_Indicators_urbshare2
* We then import it into stata. 

clear
import excel "Data_Extract_From_World_Development_Indicators_urbshare2.xlsx", sheet("Data") firstrow clear
* We only keep countries, not regions. 
keep if _n <= 217
* We use the year 2019 for the analysis. 
gen urbshare = y2019
keep countrycode countryname urbshare
ren countrycode ccode
ren countryname country_wb 
tab country_wb if urbshare == .
sum urbshare, d
* For these countries, we use various sources to obtain the missing values.
* Eritrea:
* Url: World Urbanization Prospects: The 2018 Revision
* https://population.un.org/wup/
* Last accessed: 10-05-2022
replace urbshare = 41.3 if country_wb == "Eritrea"
* Kosovo
* Url: The percentage of the total population that is urban is about 50% (Kosovo Institute for Spatial Planning, 2018).
* https://unhabitat.org/kosovo#:~:text=The%20percentage%20of%20the%20total,for%20Spatial%20Planning%2C%202018).
* Last accessed: 10-05-2022
replace urbshare = 50 if country_wb == "Kosovo"
* We cannot find an estimate for "St Martin (French part)" and thus leave it as missing
replace urbshare = . if country_wb == "St. Martin (French part)"
sort country_wb 
save urbshare, replace
count

* We merge with the main data set *
use country3Dfull, clear
sort country_wb 
merge country_wb using urbshare
tab _m
tab country_wb if _m == 1
drop _m
* We now use Wikipedia to replace the estimate for Taiwan and St. Martin (French part)
* From Wikipedia (last accessed 10-05-2022; circa 2019):
replace urbshare = 78.9 if country_wb == "Taiwan"
replace urbshare = 80.0 if country_wb == "St. Martin (French part)"
label var urbshare "Urban share c. 2019 (%; WB-WDI)"
codebook urbshare if sample == 1
tab country_wb if sample == 1 & urbshare == .
save country3Dfull, replace

***************************
*** PER CAPITA GDP DATA ***
***************************

* GDP per capita, PPP (constant 2017 international $)
* Source: World Bank's World Development Indicators database
* Last accessed: 10-05-2022
* Downloaded file: Data_Extract_From_World_Development_Indicators_pcgdp
* We modify the excel file manually so that it can be imported into Stata.
* Modified file: Data_Extract_From_World_Development_Indicators_pcgdp2
* We then import it into stata. 

clear
import excel "Data_Extract_From_World_Development_Indicators_pcgdp2.xlsx", sheet("Data") firstrow clear
* We only keep countries, not regions. 
keep if _n <= 217
* We obtain pcGDP c. the years 2012 2015 and 2019
* For example, for 2012, we use 2012. If unavailable, we use the years around (within a three-year window).
gen pcgdp2012 = y2012  
foreach X in 2011 2013 2010 2014 2009 2015 {
replace pcgdp2012 = y`X' if pcgdp2012 == . 
}
* For 2015
gen pcgdp2015 = y2015  
foreach X in 2014 2016 2013 2017 2012 2018 {
replace pcgdp2015 = y`X' if pcgdp2015 == . 
}
* For 2019 (due to COVID-19, we ignore 2020-2022 and in order to capture more "permanent" economic conditions)
gen pcgdp2019 = y2019
foreach X in 2018 2017 2016 {
replace pcgdp2019 = y`X' if pcgdp2019 == . 
}
* Average 2010s
egen pcgdp2010s = rmean(y201*)
keep countrycode countryname pcgdp*
ren countrycode ccode
ren countryname country_wb 
sort country_wb 
save pcgdp, replace

* We merge with the main data set *
use country3Dfull, clear
sort country_wb 
merge country_wb using pcgdp
tab _m
tab country_wb if _m == 1
* Same countries as before
drop _m
* We check which sample countries have missing pcGDP data
tab country_wb if pcgdp2012 == . & sample == 1
tab country_wb if pcgdp2015 == . & sample == 1
tab country_wb if pcgdp2019 == . & sample == 1
* Among these, we only consider countries that are "large" enough in terms of land area. 
sum landareasqkm, d
* We consider countries with an area above 500 sq km
tab country_wb if pcgdp2012 == . & sample == 1 & landareasqkm > 500
tab country_wb if pcgdp2015 == . & sample == 1 & landareasqkm > 500
tab country_wb if pcgdp2019 == . & sample == 1 & landareasqkm > 500
* For these countries, we find the closest country in terms of per capita GDP in PPP terms according the The World Factbook of the CIA
* https://www.cia.gov/the-world-factbook/field/real-gdp-per-capita/country-comparison
* Last accessed: 10-05-2022
* The file CIA_world_factbook_10052022 in the folder shows the data.
foreach Y in 2012 2015 2019 2010s {
* For example, for Cuba, we use Moldova
gen temp = pcgdp`Y' if country_wb == "Moldova"
egen maxtemp = max(temp)
sum maxtemp
replace pcgdp`Y' = maxtemp if country_wb == "Cuba" & pcgdp`Y' == .
drop temp maxtemp
* Eritrea - We use Sierra Leone
gen temp = pcgdp`Y' if country_wb == "Sierra Leone"
egen maxtemp = max(temp)
sum maxtemp
replace pcgdp`Y' = maxtemp if country_wb == "Eritrea" & pcgdp`Y' == .
drop temp maxtemp
* French Polynesia - We use China
gen temp = pcgdp`Y' if country_wb == "China"
egen maxtemp = max(temp)
sum maxtemp
replace pcgdp`Y' = maxtemp if country_wb == "French Polynesia" & pcgdp`Y' == .
drop temp maxtemp
* Greenland - We use the United Kingdom 
gen temp = pcgdp`Y' if country_wb == "United Kingdom"
egen maxtemp = max(temp)
sum maxtemp
replace pcgdp`Y' = maxtemp if country_wb == "Greenland" & pcgdp`Y' == .
drop temp maxtemp
* Guam - We use Estonia
gen temp = pcgdp`Y' if country_wb == "Estonia"
egen maxtemp = max(temp)
sum maxtemp
replace pcgdp`Y' = maxtemp if country_wb == "Guam" & pcgdp`Y' == .
drop temp maxtemp
* Isle of Man - We use Bermuda
gen temp = pcgdp`Y' if country_wb == "Bermuda"
egen maxtemp = max(temp)
sum maxtemp
replace pcgdp`Y' = maxtemp if country_wb == "Isle of Man" & pcgdp`Y' == .
drop temp maxtemp
* Korea, Dem. People's Rep. - We use Eritrea
gen temp = pcgdp`Y' if country_wb == "Eritrea"
egen maxtemp = max(temp)
sum maxtemp
replace pcgdp`Y' = maxtemp if country_wb == "Korea, Dem. People's Rep." & pcgdp`Y' == .
drop temp maxtemp
* New Caledonia - We use Hungary
gen temp = pcgdp`Y' if country_wb == "Hungary"
egen maxtemp = max(temp)
sum maxtemp
replace pcgdp`Y' = maxtemp if country_wb == "New Caledonia" & pcgdp`Y' == .
drop temp maxtemp
* South Sudan - We use Sierra Leone
gen temp = pcgdp`Y' if country_wb == "Sierra Leone"
egen maxtemp = max(temp)
sum maxtemp
replace pcgdp`Y' = maxtemp if country_wb == "South Sudan" & pcgdp`Y' == .
drop temp maxtemp
* Syrian Arab Republic - We use Comoros
gen temp = pcgdp`Y' if country_wb == "Comoros"
egen maxtemp = max(temp)
sum maxtemp
replace pcgdp`Y' = maxtemp if country_wb == "Syrian Arab Republic" & pcgdp`Y' == .
drop temp maxtemp
* Taiwan - We use Curacao
gen temp = pcgdp`Y' if country_wb == "Curacao"
egen maxtemp = max(temp)
sum maxtemp
replace pcgdp`Y' = maxtemp if country_wb == "Taiwan" & pcgdp`Y' == .
drop temp maxtemp
* Venezuela, RB - We use Bolivia 
gen temp = pcgdp`Y' if country_wb == "Bolivia"
egen maxtemp = max(temp)
sum maxtemp
replace pcgdp`Y' = maxtemp if country_wb == "Venezuela, RB" & pcgdp`Y' == .
drop temp maxtemp
* Yemen, Rep. - We use the Solomon Islands
gen temp = pcgdp`Y' if country_wb == "Solomon Islands"
egen maxtemp = max(temp)
sum maxtemp
replace pcgdp`Y' = maxtemp if country_wb == "Yemen, Rep." & pcgdp`Y' == .
drop temp maxtemp
}
* We verify that pcgdp is not missing
codebook pcgdp* if sample == 1 & landareasqkm > 500
* We create the log of pcGDP and the labels
foreach X in 2012 2015 2019 2010s {
label var pcgdp`X' "GDP per capita, PPP, `X' (cst 2017 intl dol; WB-WDI)"
gen lpcgdp`X' = log(pcgdp`X')
label var lpcgdp`X' "Log GDP per capita, PPP, `X' (cst 2017 intl dol; WB-WDI)"
}
codebook pcgdp* if sample == 1
* Ultimately, 10 small countries do not have per capita GDP
* We ignore the fact that they have missing values. 
sort country_wb
save country3Dfull, replace

*******************************
*** WORLD BANK INCOME GROUP ***
*******************************

* World Bank classification of countries:
* Name of the original file: OGHIST
* Url: https://datahelpdesk.worldbank.org/knowledgebase/articles/906519-world-bank-country-and-lending-groups
* Last accessed 10-08-2022
* Name of the modified file to be imported in Stata: OGHIST2
import excel "OGHIST2.xlsx", sheet("Sheet1") firstrow clear
* We create it for several years
drop ccode
ren countryname country_wb 
ren y2010 incgroup2010
label var incgroup2010 "WB income group 2010"
ren y2015 incgroup2015
label var incgroup2015 "WB income group 2015"
ren y2019 incgroup2019
label var incgroup2019 "WB income group 2019"
* We change some country names to combine this data set with our main data set. 
replace country_wb = "Curacao" if country_wb == "Curaçao"
replace country_wb = "Cote d'Ivoire" if country_wb == "Côte d'Ivoire"
replace country_wb = "Faroe Islands" if country_wb == "Faeroe Islands"
replace country_wb = "Korea, Dem. People's Rep." if country_wb == "Korea, Dem. Rep."
replace country_wb = "Sao Tome and Principe" if country_wb == "São Tomé and Príncipe"
replace country_wb = "Taiwan" if country_wb == "Taiwan, China"
replace country_wb = "Turkiye" if country_wb == "Türkiye"
keep country_wb incgroup*
sort country_wb
save incgroup, replace

* We merge with the main data set *
use country3Dfull, clear
sort country_wb 
merge country_wb using incgroup
tab _m
drop if _m == 2
drop _m
save country3Dfull, replace

*************************
*** WORLD BANK REGION ***
*************************

* We add the World Bank region. 
* World Bank region
* Source: https://datatopics.worldbank.org/world-development-indicators/images/figures-png/world-by-region-map.pdf
* Last accessed 10-08-2022
use "wbregion.dta", clear
replace ccode = "IMN" if ccode == "IMY"
sort ccode
save "wbregion.dta", replace

* We merge with the main data set *
use country3Dfull, clear
sort ccode
merge ccode using "wbregion.dta"
tab _m
drop if _m == 2
drop _m
label var wbregion "World Bank region c. 2022"
* We create abbreviated versions of the regional names.
gen wbregionshort = "EAP" if wbregion == "East Asia & Pacific"
replace wbregionshort = "ECA" if wbregion == "Europe & Central Asia"
replace wbregionshort = "LAC" if wbregion == "Latin America & Caribbean"
replace wbregionshort = "MENA" if wbregion == "Middle East & North Africa"
replace wbregionshort = "NAM" if wbregion == "North America"
replace wbregionshort = "SAR" if wbregion == "South Asia"
replace wbregionshort = "SSA" if wbregion == "Sub-Saharan Africa"
tab wbregionshort, m
tab wbregionshort if sample == 1, m 
label var wbregionshort "World Bank region acronym c. 2022"
save country3Dfull, replace

********************
*** UN SUBREGION ***
********************

* We add the UN subregion name. 
use unsubregion_list, clear
replace country_wb = "Turkiye" if country_wb == "Turkey"
sort country_wb
save unsubregion_list2, replace

* We combine with the main data set. 
use country3Dfull, clear
sort country_wb
merge country_wb using unsubregion_list2
tab _m
bysort wbregion: tab country_wb if _m == 1
* We add the subregion when the match is imperfect.
replace unsubregion = "Caribbean" if wbregion == "Latin America & Caribbean" & _m == 1
tab country_wb if unsubregion == "" & wbregion != ""
bysort wbregion: tab country_wb if _m == 1
replace unsubregion = "Melanesia" if wbregion == "East Asia & Pacific" & _m == 1 & country_wb != "Macao SAR, China"
replace unsubregion = "Eastern Asia" if wbregion == "East Asia & Pacific" & _m == 1 & country_wb == "Macao SAR, China"
replace unsubregion = "North America" if country_wb == "Bermuda"
replace unsubregion = "Eastern Africa" if country_wb == "Seychelles"
bysort wbregion: tab country_wb if _m == 1 & unsubregion == ""
replace unsubregion = "Southern Europe" if country_wb == "Andorra"
replace unsubregion = "Southern Europe" if country_wb == "Gibraltar"
replace unsubregion = "Southern Europe" if country_wb == "San Marino"
replace unsubregion = "Western Europe" if country_wb == "Liechtenstein"
replace unsubregion = "Western Europe" if country_wb == "Monaco"
replace unsubregion = "Northern Europe" if country_wb == "Channel Islands"
replace unsubregion = "Northern Europe" if country_wb == "Faroe Islands"
replace unsubregion = "Northern Europe" if country_wb == "Greenland"
replace unsubregion = "Northern Europe" if country_wb == "Isle of Man"
drop _m
codebook unsubregion if wbregion != ""
save country3Dfull, replace

********************************************************
*** EXTRA VARIABLES AND PREPARING THE FINAL DATA SET ***
********************************************************

* We create a few extra variables and label them. 
use country3Dfull, clear

*** PER CAPITA 2D AND 3D STOCK MEASURES ***

* 2D = area = sq m per resident
gen pcstock2D = (stock2D)/(totpop)
label var pcstock2D "Area per cap 2019 (sq m per inh)"
* 3D = volume = cubic m per resident
* Old = v1 of WFS
* New = v2 of WFS
gen pcstock3D_old = (stock3D_old)/(totpop)
gen pcstock3D_new = (stock3D_new)/(totpop)
label var pcstock3D_old "Volume per cap 2019 (cubic m per inh; old v)"
label var pcstock3D_new "Volume per cap 2019 (cubic m per inh; new v)"

*** 2D-BASED VOLUME ***

* Volume implied by 2D and an average ceiling height of 2.5 meters
gen stock3D_2D = stock2D*2.5
label var stock3D_2D "Volume 2019 (cubic m per inh; based on 2D)"
* Per capita volume implied by 2D
gen pcstock3D_2D = (stock3D_2D)/(totpop)
label var pcstock3D_2D "Volume per cap 2019 (cubic m per inh; based on 2D)"

*** VOLUME 3D VS 2D ***

* We calculate the volume excluding the 2D-based volume (outward).
* This allows us to calculate the upward volume.  
gen stock3D_no2D = stock3D_new - stock3D_2D
label var stock3D_no2D "Volume 2012-2019 (cubic m; excl 2D vol)"
sum stock3D_no2D, d
* In per capita terms: 
gen pcstock3D_no2D = pcstock3D_new - pcstock3D_2D
label var pcstock3D_no2D "Volume per capita 2012-2019 (cubic m; excl 2D)"

*** HIGH-RISES VS LOW RISES ***

* Stock of high-rise (HR) vs low-rise (LR) heights
* Equal to new (v2: with Emporis) vs old (v1: without Emporis) 
gen stock3D_HR = stock3D_new-stock3D_old
sum stock3D_HR, d
* LR = old stock excluding Emporis since SAR-based data doesn't work well above 50 m. 
gen stock3D_LR = stock3D_old
sum stock3D_LR, d
label var stock3D_HR "Volume 2012-2019 (cubic m; high-rises)"
label var stock3D_LR "Volume 2012-2019 (cubic m; low-rises)"
* In per capita terms: 
gen pcstock3D_HR = (stock3D_HR)/(totpop)
gen pcstock3D_LR = (stock3D_LR)/(totpop)
label var pcstock3D_HR "Volume per cap 2019 (cubic m per inh; high-rises)"
label var pcstock3D_LR "Volume per cap 2019 (cubic m per inh; low-rises)"
* Same but excluding the 2D volume from LR
* This captures the upward volume from LR
gen stock3D_LRno2D = stock3D_LR - stock3D_2D
sum stock3D_LRno2D, d
label var stock3D_LRno2D "Volume 2012-2019 (cubic m; low-rises; excl 2D)"
gen pcstock3D_LRno2D = (stock3D_LRno2D)/(totpop)
label var pcstock3D_LRno2D "Volume per cap 2019 (cubic m per inh; low-rises; excl 2D)"

*** AVERAGE HEIGHT *** 

* We create the "average height" variables = volume divided by area
* Average height using volume and defined relative to total land area 
gen avght_new_div = stock3D_new/stock2D
corr avght_new avght_new_div
sum avght_new avght_new_div
label var avght_new_div "Volumer per area or avg height (m)"
* We use this new value from now and thus remove the older values that were initially created by the DLR team
drop avght_old avght_new lavght_old lavght_new Total_H_v1 Total_H_v2 Nr_Px lstock*

*** WE ORDER THE VARIABLES ***

order ccode country_wb Country_ID NAME GDP_2015 sample wbregion* incgroup* *pcgdp* landareasqkm *totpop* urbshare stock* pcstock* avght*

*** LOG OF THE VARIABLES ***

* We create logs of these variables.  
foreach X of varlist stock* pcstock* avght* {
gen l`X' = log(`X')
label var l`X' "Log of `X'"
}

*** CROWDING MEASURES ***

* We now create various "crowding" measures
* GFA = gross floor area
* GFA per capita 
* We assume an average ceiling height of 2.5 meters
sum pcstock3D_new, d
* The mean cubic m is about 250 cubic meters 
gen pcGFA3D_new = pcstock3D_new/2.5
sum pcGFA3D_new, d
* As a result, the mean GFA is about 100 sq meters
* We create the log and label variables. 
gen lpcGFA3D_new = log(pcGFA3D_new)
label var pcGFA3D_new "Per capita GFA (sq m; newer version)"
label var lpcGFA3D_new "Log per capita GFA (sq m; newer version)"

* We create the rank of the country in terms of population 2019
gsort- totpop
* We keep the X most populated countries for the label
gen rankpop = _n
label var rankpop "Country rank in terms of population c. 2019"

* We create the rank of the country in terms of GDP 2019
gen gdp_2019 = totpop*pcgdp2019
gsort- gdp_2019
* We keep the X most populated countries for the label
gen rankgdp = _n
label var rankgdp "Country rank in terms of total GDP c. 2019"
drop gdp_2019

save country3Dfull, replace

**************************************************
*** DATA FOR PREDICTING THE CONSTRUCTION NEEDS ***
**************************************************

*** REAL GDP GROWTH 2019 2020 2021 ***

* Due to COVID-19, we predict real GDP growth from the viewpoint of 2019
* The source for the growth rates is:
* World Bank. 2019. Global Economic Prospects, June 2019: Heightened Tensions, Subdued Investment. Washington, DC: World Bank. doi: 10.1596/978-1-4648-1398-6. License: Creative Commons Attribution CC BY 3.0 IGO.
* Source file: Global-Economic-Prospects-2019-GDP-growth-data
* Modified file so as to import it in Stata: Global-Economic-Prospects-2019-GDP-growth-data2

clear
import excel "Global-Economic-Prospects-2019-GDP-growth-data2.xlsx", sheet("Sheet1") firstrow
drop if country_wb == ""
gen country_gep = country_wb
sum y2019f
* We start from the viewpoint of 2019 (pre-COVID)
gen gr19 = 1+y2019f/100
sum gr19
gen gr20 = 1+y2020f/100
sum gr20
gen gr21 = 1+y2021f/100
sum gr21
gen gr1921 = gr19*gr20*gr21
sum gr1921
* We create the average growth rate 2019-2021
gen avggr1921 = (gr1921)^(1/3)
sum avggr1921, d
keep country_gep avggr1921 gr1921
* We change some country names. 
replace country_gep = "Cote d'Ivoire" if country_gep == "Côte d'Ivoire"
replace country_gep = "Egypt, Arab Rep." if country_gep == "Egypt"
replace country_gep = "Iran, Islamic Rep." if country_gep == "Iran"
replace country_gep = "Timor-Leste" if country_gep == "Timor Leste"
replace country_gep = "Turkiye" if country_gep == "Turkey"
sort country_gep
save predicted_gdp_growth, replace

* We merge with the main data set
use country3Dfull, clear
gen country_gep = country_wb
sort country_gep
merge country_gep using predicted_gdp_growth
tab _m
tab country_gep if _m == 2
tab country_gep if _m == 1 & wbregionshort == "ECA"
* We use the "Euro area" estimate for these countries
replace country_gep = "Euro area" if _m == 1 & wbregionshort == "ECA"
drop _m
* Now that we have all the country/region names, we re-merge the data  
sort country_gep
merge country_gep using predicted_gdp_growth, update
tab _m
drop _m
* Based on this growth rate, we predict pcGDP in 2025 (abstracting from population growth)
sum gr1921 [w=totpop]
drop gr1921
gen pcgdp2025_inc = pcgdp2019*(avggr1921^6) 
gen lpcgdp2025_inc = log(pcgdp2025_inc)
* Variables capturing the increase from 2019-2025
gen chglpcgdp1925_inc = lpcgdp2025_inc - lpcgdp2019
sum chglpcgdp1925_inc, d
sum chglpcgdp1925_inc [w=totpop]
* We label the variables. 
label var lpcgdp2025_inc "Log of pcgdp2025_inc"
label var pcgdp2025_inc "Predicted pcGDP 2025, inc gr (cst 2017 intl dol; WB-WDI)"
label var chglpcgdp1925_inc "Change in lpcgdp 2019-2025, inc gr (cst 2017 intl dol; WB-WDI)"
drop country_gep
save country3Dfull, replace

*** POPULATION GROWTH 2020-2030 ***

* We add the estimates of future population. 
* World Population Prospects of the United Nations, The 2019 Revision
* Url: https://population.un.org/wpp/Download/Archive/Standard/
* Last accessed: 10-12-2022
import excel "Copy of WPP2019_POP_F01_1_TOTAL_POPULATION_BOTH_SEXES2.xlsx", sheet("Sheet1") firstrow clear
drop if code >= 900 & code != .
keep country y2020 y2025 y2030 y2050
replace y2020 = "" if y2020 == "..."
replace y2025 = "" if y2025 == "..."
replace y2030 = "" if y2030 == "..."
replace y2050 = "" if y2050 == "..."
destring y2020 y2025 y2030 y2050, replace
* We have the future population in 2025 and in 2030. 
ren y2025 totpop2025
ren y2030 totpop2030
ren y2050 totpop2050
keep country totpop2025 totpop2050
* We change some country names to match our main database  
replace country = "Bahamas, The" if country == "Bahamas"
replace country = "Bolivia" if country == "Bolivia (Plurinational State of)"
replace country = "Congo, Dem. Rep." if country == "Democratic Republic of the Congo"
replace country = "Cote d'Ivoire" if country == "Côte d'Ivoire"
replace country = "Czech Republic" if country == "Czechia"
replace country = "Egypt, Arab Rep." if country == "Egypt"
replace country = "Gambia, The" if country == "Gambia"
replace country = "Korea, Rep." if country == "Republic of Korea"
replace country = "Kyrgyz Republic" if country == "Kyrgyzstan"
replace country = "Lao PDR" if country == "Lao People's Democratic Republic"
replace country = "Micronesia, Fed. Sts." if country == "Micronesia (Fed. States of)"
replace country = "Moldova" if country == "Republic of Moldova"
replace country = "Slovak Republic" if country == "Slovakia"
replace country = "Tanzania" if country == "United Republic of Tanzania"
replace country = "United States" if country == "United States of America"
replace country = "Venezuela, RB" if country == "Venezuela (Bolivarian Republic of)"
replace country = "Vietnam" if country == "Viet Nam"
replace country = "West Bank and Gaza" if country == "State of Palestine"
replace country = "Yemen, Rep." if country == "Yemen"
replace country = "Eswatini" if country == "Swaziland" 
replace country = "Congo, Rep." if country == "Congo" 
replace country = "Hong Kong SAR, China" if country == "China, Hong Kong SAR" 
replace country = "Iran, Islamic Rep." if country == "Iran (Islamic Republic of)" 
replace country = "Korea, Dem. People’s Rep." if country == "Dem. People's Republic of Korea" 
replace country = "Macao SAR, China" if country == "China, Macao SAR" 
replace country = "North Macedonia" if country == "TFYR Macedonia" 
replace country = "Republic of China (Taiwan)" if country == "China, Taiwan Province of China" 
replace country = "Curacao" if country == "Curaçao" 
replace country = "Falkland Islands" if country == "Falkland Islands (Malvinas)" 
replace country = "Vatican City" if country == "Holy See" 
replace country = "Korea, Dem. People's Rep." if country == "Korea, Dem. People’s Rep." 
replace country = "Taiwan" if country == "Republic of China (Taiwan)" 
replace country = "Reunion" if country == "Réunion" 
replace country = "Saint-BarthÃ©lemy" if country == "Saint Barthélemy" 
replace country = "St. Kitts and Nevis" if country == "Saint Kitts and Nevis" 
replace country = "St. Lucia" if country == "Saint Lucia" 
replace country = "St. Martin (French part)" if country == "Saint Martin (French part)" 
replace country = "St. Vincent and the Grenadines" if country == "Saint Vincent and the Grenadines" 
replace country = "Turkiye" if country == "Turkey" 
replace country = "Virgin Islands (U.S.)" if country == "United States Virgin Islands" 
replace country = "" if country == "Wallis and Futuna Islands" 
ren country country_wb 
sort country_wb
save futurepop, replace

* We merge with the main data set 
use country3Dfull, clear
sort country_wb
merge country_wb using futurepop
tab _m
tab country_wb if _m == 2
tab country_wb if _m == 1 
drop _m
* Predicting total GDP 2025
gen gdp2025 = (pcgdp2019*totpop)*(avggr1921^6) 
sum avggr1921
sum totpop totpop2025 if ccode == "FRA"
* We express total population 2025 in inhabitants
replace totpop2025 = totpop2025*1000
sum totpop totpop2025 if ccode == "FRA"
sum totpop totpop2025 if ccode == "NGA"
* We express total population 2050 in inhabitants
replace totpop2050 = totpop2050*1000
sum totpop totpop2050 if ccode == "FRA"
sum totpop totpop2050 if ccode == "NGA"
* We then obtain pcGDP in 2030
gen gdp2015 = (pcgdp2019*totpop)
gen pcgdp2025_incpop = gdp2025/totpop2025 
drop gdp2025
sum pcgdp2025_incpop, d
gen lpcgdp2025_incpop = log(pcgdp2025_incpop)
* Increase 2019-2025
sum lpcgdp2019 lpcgdp2025_incpop [w=totpop]
* we check that the values are indeed higher in 2022
sum avggr1921, d 
gen chglpcgdp1925_incpop = lpcgdp2025_incpop - lpcgdp2019
count if chglpcgdp1925_inc < 0 
sum chglpcgdp1925_inc chglpcgdp1925_incpop [w=totpop]
drop gdp2015 
* We label the variables. 
label var avggr1921 "Predicted avg. pc GDP growrth rate 2019-2021"
label var totpop2025 "Total pop (inh.) in 2025"
label var lpcgdp2025_incpop "Log of pcgdp2025_inc"
label var totpop2050 "Total pop (inh.) in 2050"
label var pcgdp2025_incpop "Predicted pcGDP 2025, inc-pop gr (cst 2017 intl dol; WB-WDI)"
label var chglpcgdp1925_incpop "Change in lpcgdp 2019-2025, inc-pop gr (cst 2017 intl dol; WB-WDI)"
save country3Dfull, replace

**********************
*** EXTRA DATA SET ***
**********************

* This data set includes the total population of each UN subregion today
* We need it later for some analyses. 
* Total pop of each UN subregion
use country3Dfull, clear
keep if sample == 1 & lpcgdp2010s != .
count
* 204
collapse (sum) totpop, by(unsubregion)
sort unsubregion
label var totpop "Total pop (inh) of each UN subregion"
save totpopsubregion, replace

***********************************************************************************************************

***********************
***********************
*** CITY-LEVEL DATA ***
***********************
***********************

*******************************************************
*** COUNTRY INFORMATION FOR THE CITY-LEVEL ANALYSIS ***
*******************************************************

* We extract some country-level variables for the city-level analysis
use country3Dfull, clear
drop if country_wb == ""
* These are the variables we keep.
* We rename some of them. 
keep Country_ID lpcgdp2010s country_wb *region* rank* ccode incgroup2019 
ren rankpop rankpop_cntry
ren rankgdp rankgdp_cntry
tab country_wb if Country_ID == ""
drop if Country_ID == ""
sort Country_ID
save country_info_for_cities, replace
codebook Country_ID
* 235

**************************
*** CITY-LEVEL 3D DATA ***
**************************

* This is the volume data extracted at the city level.
* We use the "urban centres" (UCs) of the GHS database. 
clear
import excel "UC_Stats_17102022.xlsx", sheet("Merged") clear firstrow
count
* 13,067
* 2 observations are just empty lines. We drop them. 
drop if Country_ID == ""
count
* 13,065
* We first add the country-level information.
sort Country_ID
merge Country_ID using country_info_for_cities
tab _m
tab Country_ID if _m == 1
drop if _m == 2
* These are countries without UCs. 
drop _m
count
* 13,065
sort Country_ID
ren UC_NM_MN cityname
* We change some country and city names and codes. 
* We rename San Diego that appears as Tijuana in the GHS database.
replace cityname = "San Diego" if cityname == "Tijuana" & country_wb == "United States"
* We separate Hong Kong and Macao.
tab cityname if UC_ID == "UCID12291" | UC_ID == "UCID12281" | UC_ID == "UCID12274"
replace country_wb = "Hong Kong SAR, China" if UC_ID == "UCID12291" | UC_ID == "UCID12281" | UC_ID == "UCID12274"
replace ccode = "HKG" if UC_ID == "UCID12291" | UC_ID == "UCID12281" | UC_ID == "UCID12274"
* In Macao, Zhuhai includes Macao SAR.
* Since the GHS boundary includes Zhuhai, we include its whole territory in Macao.
tab cityname if UC_ID == "UCID12211" | UC_ID == "UCID12240"
replace country_wb = "Macao SAR, China" if UC_ID == "UCID12211" | UC_ID == "UCID12240"
replace ccode = "MAC" if UC_ID == "UCID12211" | UC_ID == "UCID12240"
* The GDP values = 0 are actually missing
replace GDP_2015 = . if GDP_2015 == 0
* We rename some variables. 
ren Total_2D stock2D
* V1 = old version
* V2 = new version
ren Av_H_v1 avght_old
ren Av_H_v2 avght_new
ren Total_V_v1 stock3D_old
ren Total_V_v2 stock3D_new
* The GDP values = 0 are actually missing
replace GDP_2015 = . if GDP_2015 == 0
* We label the variables.
label var GDP_2015 "Total GDP 2015 based on night lights"
label var stock2D "Built-up area 2012-2019 (sq m)"
label var stock3D_old "Volume 2012-2019 (cubic m; excl tall buildings)"
label var stock3D_new "Volume 2012-2019 (cubic m; incl tall buildings)"
label var avght_old "Average height 2012-2019 (m; excl tall buildings)"
label var avght_new "Average height 2012-2019 (m; incl tall buildings)"
foreach X of varlist stock* avght* {
gen l`X' = log(`X')
label var l`X' "Log of `X'"
}
sort UC_ID
save city3D, replace

********************************
*** ADDING THE GHS VARIABLES ***
********************************

* We add the main GHS variables that we need for the analysis.  
clear
import excel "UC_Stats_17102022.xlsx", sheet("GHS_Info") clear firstrow
keep UC_ID QA2_1V GCPNT* CTR_MN_NM UC_NM_MN P75-P15 AREA
sort UC_ID
save city_ghs, replace

* We merge with the main data set. 
use city3D, clear
sort UC_ID
merge UC_ID using city_ghs
tab _m
sort _m
drop _m
tab QA2_1V, m
* We drop the "false positives" according to GHS (i.e., localities falsely idenfitied as cities)
drop if QA2_1V == 0
count
* 12,831
sort UC_ID
sum GDP_2015
sort UC_ID
* We use the log of these variables. 
foreach X in P75 P90 P00 P15 {
gen l`X' = log(`X')
label var l`X' "Log of `X'"
}
gen lGDP_2015 = log(GDP_2015)
label var GDP_2015 "GDP 2015 (Real GDP in mil 2017 USD)"
label var lGDP_2015 "Log GDP 2015 (Real GDP in mil 2017 USD)"
save city3Dfull, replace
count
* This is the final data set. It has 12,831 cities.

***********************
*** EXTRA VARIABLES ***
***********************

* We add some variables. 

use city3Dfull, clear

*** POPULATION ***

gen totpop = P15
label var totpop "Total pop of the city in 2015"

*** CITY PER CAPITA GDP ***

gen lpcGDP2015 = log(GDP_2015/1000)/log(P15)
label var lpcGDP2015 "Log city GDP billion 2015 (lights) / log city pop 2015 (GHS), centered"
sum lpcGDP2015, d
* We center it
replace lpcGDP2015 = lpcGDP2015+1.024572 
sum lpcGDP2015, d

*** PER CAPITA STOCK MEASURES ***

* sq m per resident
gen pcstock2D = (stock2D)/(totpop)
label var pcstock2D "Area per cap 2019 (sq m per inh)"
* cubic m per resident
gen pcstock3D_old = (stock3D_old)/(totpop)
gen pcstock3D_new = (stock3D_new)/(totpop)
label var pcstock3D_old "Volume per cap 2019 (cubic m per inh; old v)"
label var pcstock3D_new "Volume per cap 2019 (cubic m per inh; new v)"

*** VOLUME 2D ***

* Volume implied by 2D and average ceiling height of 2.5 meters
gen stock3D_2D = stock2D*2.5
label var stock3D_2D "Volume 2019 (cubic m per inh; based on 2D)"
gen pcstock3D_2D = (stock3D_2D)/(totpop)
label var pcstock3D_2D "Volume per cap 2019 (cubic m per inh; based on 2D)"

*** VOLUME 3D VS 2D ***

* Volume excluding the 2D-based volume. 
gen stock3D_no2D = stock3D_new - stock3D_2D
label var stock3D_no2D "Volume 2012-2019 (cubic m; excl 2D vol)"
sum stock3D_no2D, d
gen pcstock3D_no2D = pcstock3D_new - pcstock3D_2D
label var pcstock3D_no2D "Volume per capita 2012-2019 (cubic m; excl 2D)"

*** HIGH-RISES VS LOW RISES ***

* Stock of high-rise (HR) vs low-rise (LR) heights
* Equal to new (with Emporis) vs old (without Emporis) 
gen stock3D_HR = stock3D_new-stock3D_old
sum stock3D_HR, d
gen stock3D_LR = stock3D_old
sum stock3D_LR, d
label var stock3D_HR "Volume 2012-2019 (cubic m; high-rises)"
label var stock3D_LR "Volume 2012-2019 (cubic m; low-rises)"
gen pcstock3D_HR = (stock3D_HR)/(totpop)
gen pcstock3D_LR = (stock3D_LR)/(totpop)
label var pcstock3D_HR "Volume per cap 2019 (cubic m per inh; high-rises)"
label var pcstock3D_LR "Volume per cap 2019 (cubic m per inh; low-rises)"
* Same but excluding the 2D volume from LR
gen stock3D_LRno2D = stock3D_LR - stock3D_2D
sum stock3D_LRno2D, d
label var stock3D_LRno2D "Volume 2012-2019 (cubic m; low-rises; excl 2D)"
gen pcstock3D_LRno2D = (stock3D_LRno2D)/(totpop)
label var pcstock3D_LRno2D "Volume per cap 2019 (cubic m per inh; low-rises; excl 2D)"

*** AVERAGE HEIGHT *** 

* Average height using volume and defined relative to total land area
gen avght_new_div = stock3D_new/stock2D
corr avght_new avght_new_div
* 0.98
sum avght_new avght_new_div
label var avght_new_div "Volumer per area or avg height (m)"
* We use this new value from now and thus remove the older values
drop avght_old avght_new lavght_old lavght_new Nr_Px lstock*

*** LOG OF THE VARIABLES ***

* Logs of these variables. 
foreach X of varlist stock* pcstock* avght* {
gen l`X' = log(`X')
label var l`X' "Log of `X'"
}

*** CROWDING MEASURES ***

* GFA per capita 
* We assume an average height of 2.5 meters
sum pcstock3D_new, d
* Mean = about 250 cubic meters 
gen pcGFA3D_new = pcstock3D_new/2.5
sum pcGFA3D_new, d
* Mean GFA about 100 sq meters
gen lpcGFA3D_new = log(pcGFA3D_new)
label var pcGFA3D_new "Per capita GFA (sq m; newer version)"
label var lpcGFA3D_new "Log per capita GFA (sq m; newer version)"

*** RANK ***
gsort- P15
gen rank_pop15 = _n
label var rank_pop15 "Rank pop in 2015"

save city3Dfull, replace


